/*
 *  Checkpoint file descriptors
 *
 *  Copyright (C) 2008-2009 Oren Laadan
 *
 *  This file is subject to the terms and conditions of the GNU General Public
 *  License.  See the file COPYING in the main directory of the Linux
 *  distribution for more details.
 */

/* default debug level for output */
#define CKPT_DFLAG  CKPT_DFILE

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/file.h>
#include <linux/namei.h>
#include <linux/fs_struct.h>
#include <linux/fs.h>
#include <linux/fdtable.h>
#include <linux/fsnotify.h>
#include <linux/dnotify.h>
#include <linux/pipe_fs_i.h>
#include <linux/syscalls.h>
#include <linux/deferqueue.h>
#include <linux/checkpoint.h>
#include <linux/eventpoll.h>
#include <linux/eventfd.h>
#include <net/sock.h>

/**************************************************************************
 * Checkpoint
 */

/**
 * ckpt_fill_fname - return pathname of a given file
 * @path: path name
 * @root: relative root
 * @buf: buffer for pathname
 * @len: buffer length (in) and pathname length (out)
 */
char *ckpt_fill_fname(struct path *path, struct path *root, char *buf, int *len)
{
	struct path tmp = *root;
	char *fname;

	BUG_ON(!buf);
	fname = __d_path(path, &tmp, buf, *len);
	if (IS_ERR(fname))
		return fname;
	*len = (buf + (*len) - fname);
	/*
	 * FIX: if __d_path() changed these, it must have stepped out of
	 * init's namespace. Since currently we require a unified namespace
	 * within the container: simply fail.
	 */
	if (tmp.mnt != root->mnt || tmp.dentry != root->dentry) {
		ckpt_debug("file %s was opened in an alien mnt_ns\n", fname);
		fname = ERR_PTR(-EBADF);
	}

	return fname;
}

/**
 * checkpoint_fname - resolve a path name and write it to the image
 * @ctx: checkpoint context
 * @path: path whose name is written
 * @root: root that the name is made relative to
 *
 * Returns -ENOMEM if the temporary buffer cannot be allocated, the
 * error from ckpt_fill_fname() if the name cannot be obtained, and
 * otherwise whatever ckpt_write_obj_type() returns.
 */
int checkpoint_fname(struct ckpt_ctx *ctx, struct path *path, struct path *root)
{
	int namelen = PATH_MAX;
	char *pathbuf;
	char *name;
	int err;

	/*
	 * FIXME: we can optimize and save memory (and storage) if we
	 * share strings (through objhash) and reference them instead
	 */

	pathbuf = kmalloc(namelen, GFP_KERNEL);
	if (!pathbuf)
		return -ENOMEM;

	name = ckpt_fill_fname(path, root, pathbuf, &namelen);
	if (IS_ERR(name)) {
		err = PTR_ERR(name);
		ckpt_err(ctx, err, "%(T)%(S)Obtain filename\n",
			 path->dentry->d_name.name);
	} else {
		/* namelen was updated to the length of the resolved name */
		err = ckpt_write_obj_type(ctx, name, namelen,
					  CKPT_HDR_FILE_NAME);
	}

	kfree(pathbuf);
	return err;
}

#define CKPT_DEFAULT_FDTABLE  256		/* an initial guess */

/**
 * scan_fds - scan file table and construct array of open fds
 * @files: files_struct pointer
 * @fdtable: (output) array of open fds
 *
 * Returns the number of open fds found, and also the file table
 * array via *fdtable. The caller should free the array. On failure
 * returns -ENOMEM, leaves *fdtable untouched and frees any partially
 * built array.
 *
 * The caller must validate the file descriptors collected in the
 * array before using them, e.g. by using fcheck_files(), in case
 * the task's fdtable changes in the meantime.
 */
static int scan_fds(struct files_struct *files, int **fdtable)
{
	struct fdtable *fdt;
	int *fds = NULL;
	int *grown;
	int i = 0, n = 0;
	int tot = CKPT_DEFAULT_FDTABLE;

	/*
	 * We assume that all tasks possibly sharing the file table are
	 * frozen (or we are a single process and we checkpoint ourselves).
	 * Therefore, we can safely proceed after krealloc() from where we
	 * left off. Otherwise the file table may be modified by another
	 * task after we scan it. The behavior in this case is undefined,
	 * and either checkpoint or restart will likely fail.
	 */
 retry:
	/*
	 * Keep the old pointer until krealloc() succeeds: on failure
	 * the original array is left allocated and must be freed here,
	 * otherwise it leaks.
	 */
	grown = krealloc(fds, tot * sizeof(*fds), GFP_KERNEL);
	if (!grown) {
		kfree(fds);
		return -ENOMEM;
	}
	fds = grown;

	rcu_read_lock();
	fdt = files_fdtable(files);
	for (/**/; i < fdt->max_fds; i++) {
		if (!fcheck_files(files, i))
			continue;
		if (n == tot) {
			/* array is full: drop the lock, double it, resume at i */
			rcu_read_unlock();
			tot *= 2;	/* won't overflow: kmalloc will fail */
			goto retry;
		}
		fds[n++] = i;
	}
	rcu_read_unlock();

	*fdtable = fds;
	return n;
}

/*
 * Obtain the security reference recorded for @file in the image.
 * With CONFIG_SECURITY the file's f_security is handed to
 * security_checkpoint_obj(); without it SECURITY_CTX_NONE is returned.
 */
int checkpoint_file_security(struct ckpt_ctx *ctx, struct file *file)
{
#ifdef CONFIG_SECURITY
	return security_checkpoint_obj(ctx, file->f_security,
				       CKPT_SECURITY_FILE);
#else
	return SECURITY_CTX_NONE;
#endif
}

/*
 * Fill the part of a file header that is shared by all file types:
 * flags, mode, position, version, and the references to the file's
 * credentials and security context.
 *
 * Returns 0 on success, or the negative value obtained for either
 * reference.
 */
int checkpoint_file_common(struct ckpt_ctx *ctx, struct file *file,
			   struct ckpt_hdr_file *h)
{
	/* plain copies of the open-file state */
	h->f_version = file->f_version;
	h->f_pos = file->f_pos;
	h->f_mode = file->f_mode;
	h->f_flags = file->f_flags;

	/* the cast drops the qualifier of file->f_cred for checkpoint_obj() */
	h->f_credref = checkpoint_obj(ctx, (struct cred *) file->f_cred,
				      CKPT_OBJ_CRED);
	if (h->f_credref < 0)
		return h->f_credref;

	h->f_secref = checkpoint_file_security(ctx, file);
	if (h->f_secref >= 0) {
		ckpt_debug("file %s credref %d secref %d\n",
			file->f_dentry->d_name.name, h->f_credref, h->f_secref);

		/* FIX: need also file->f_owner, etc */

		return 0;
	}

	ckpt_err(ctx, h->f_secref, "%(T)file->f_security");
	return h->f_secref;
}

/*
 * Checkpoint a generic file: write a CKPT_HDR_FILE header of type
 * CKPT_FILE_GENERIC followed by the file's pathname relative to
 * ctx->root_fs_path. Unlinked files are refused with -EBADF.
 */
int generic_file_checkpoint(struct ckpt_ctx *ctx, struct file *file)
{
	struct ckpt_hdr_file_generic *hdr;
	int err;

	/*
	 * FIXME: when we'll add support for unlinked files/dirs, we'll
	 * need to distinguish between unlinked files and unlinked dirs.
	 */
	if (d_unlinked(file->f_dentry)) {
		ckpt_err(ctx, -EBADF, "%(T)%(P)Unlinked files unsupported\n",
			 file);
		return -EBADF;
	}

	hdr = ckpt_hdr_get_type(ctx, sizeof(*hdr), CKPT_HDR_FILE);
	if (!hdr)
		return -ENOMEM;

	hdr->common.f_type = CKPT_FILE_GENERIC;

	/* each step runs only if the previous one did not fail */
	err = checkpoint_file_common(ctx, file, &hdr->common);
	if (err >= 0)
		err = ckpt_write_obj(ctx, &hdr->common.h);
	if (err >= 0)
		err = checkpoint_fname(ctx, &file->f_path, &ctx->root_fs_path);

	ckpt_hdr_put(ctx, hdr);
	return err;
}
EXPORT_SYMBOL(generic_file_checkpoint);

/*
 * Checkpoint callback for a file pointer: dispatches to the file's
 * own f_op->checkpoint() method. Files without such a method, and
 * files with dnotify attached, are refused with -EBADF.
 */
static int checkpoint_file(struct ckpt_ctx *ctx, void *ptr)
{
	struct file *filp = ptr;
	int err;

	if (!(filp->f_op && filp->f_op->checkpoint)) {
		ckpt_err(ctx, -EBADF, "%(T)%(P)%(V)f_op lacks checkpoint\n",
			       filp, filp->f_op);
		return -EBADF;
	}

	if (is_dnotify_attached(filp)) {
		ckpt_err(ctx, -EBADF, "%(T)%(P)dnotify unsupported\n", filp);
		return -EBADF;
	}

	err = filp->f_op->checkpoint(ctx, filp);
	if (err >= 0)
		return err;

	ckpt_err(ctx, err, "%(T)%(P)file checkpoint failed\n", filp);
	return err;
}

/**
 * checkpoint_file_desc - dump the state of a given file descriptor
 * @ctx: checkpoint context
 * @files: files_struct pointer
 * @fd: file descriptor
 *
 * Saves the state of the file descriptor: looks up the file pointer
 * behind @fd, passes it to checkpoint_obj() to obtain its objref, and
 * writes a CKPT_HDR_FILE_DESC record holding the objref, the
 * descriptor number and its close-on-exec flag.
 *
 * Returns the result of ckpt_write_obj() on success, -ENOMEM if the
 * header cannot be obtained, -EBADF if @fd is no longer open or holds
 * a file lock or lease, -EBUSY if the file has an owner, or the error
 * returned by checkpoint_obj().
 */
static int checkpoint_file_desc(struct ckpt_ctx *ctx,
				struct files_struct *files, int fd)
{
	struct ckpt_hdr_file_desc *h;
	struct file *file = NULL;
	struct fdtable *fdt;
	int objref, ret;
	int coe = 0;	/* avoid gcc warning */
	pid_t pid;

	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_FILE_DESC);
	if (!h)
		return -ENOMEM;

	/* take our own reference on the file while under RCU */
	rcu_read_lock();
	fdt = files_fdtable(files);
	file = fcheck_files(files, fd);
	if (file) {
		coe = FD_ISSET(fd, fdt->close_on_exec);
		get_file(file);
	}
	rcu_read_unlock();

	/*
	 * sanity check (although this shouldn't happen); done before
	 * anything else so that a NULL file is never handed to the
	 * helpers below.
	 */
	if (!file) {
		ret = -EBADF;
		ckpt_err(ctx, ret, "%(T)fd %d gone?\n", fd);
		goto out;
	}

	ret = find_locks_with_owner(file, files);
	/*
	 * find_locks_with_owner() returns an error when there
	 * are no locks found, so we *want* it to return an error
	 * code.  Its success means we have to fail the checkpoint.
	 */
	if (!ret) {
		ret = -EBADF;
		ckpt_err(ctx, ret, "%(T)fd %d has file lock or lease\n", fd);
		goto out;
	}

	/*
	 * TODO: Implement c/r of fowner and f_sigio.  Should be
	 * trivial, but for now we just refuse its checkpoint
	 */
	pid = f_getown(file);
	if (pid) {
		ret = -EBUSY;
		/* the format has two %d: the descriptor and the owner */
		ckpt_err(ctx, ret, "%(T)fd %d has an owner (%d)\n", fd, pid);
		goto out;
	}

	/*
	 * if seen first time, this will add 'file' to the objhash, keep
	 * a reference to it, dump its state while at it.
	 */
	objref = checkpoint_obj(ctx, file, CKPT_OBJ_FILE);
	ckpt_debug("fd %d objref %d file %p coe %d)\n", fd, objref, file, coe);
	if (objref < 0) {
		ret = objref;
		goto out;
	}

	h->fd_objref = objref;
	h->fd_descriptor = fd;
	h->fd_close_on_exec = coe;

	ret = ckpt_write_obj(ctx, &h->h);
out:
	ckpt_hdr_put(ctx, h);
	if (file)
		fput(file);
	return ret;
}

/*
 * Checkpoint callback for a file table: writes a CKPT_HDR_FILE_TABLE
 * header carrying the number of open descriptors, then one record per
 * descriptor via checkpoint_file_desc(), and finally runs the deferred
 * work queued on ctx->files_deferq.
 *
 * Returns 0 on success or a negative error.
 */
static int checkpoint_file_table(struct ckpt_ctx *ctx, void *ptr)
{
	struct files_struct *files = ptr;
	struct ckpt_hdr_file_table *h;
	int *fdtable = NULL;
	int nfds, n, ret;

	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_FILE_TABLE);
	if (!h)
		return -ENOMEM;

	nfds = scan_fds(files, &fdtable);
	if (nfds < 0) {
		/* the header is not released at 'out', so do it here */
		ckpt_hdr_put(ctx, h);
		ret = nfds;
		goto out;
	}

	h->fdt_nfds = nfds;

	ret = ckpt_write_obj(ctx, &h->h);
	ckpt_hdr_put(ctx, h);
	if (ret < 0)
		goto out;

	ckpt_debug("nfds %d\n", nfds);
	for (n = 0; n < nfds; n++) {
		ret = checkpoint_file_desc(ctx, files, fdtable[n]);
		if (ret < 0)
			goto out;
	}

	/* a positive result is the number of entries run: not an error */
	ret = deferqueue_run(ctx->files_deferq);
	ckpt_debug("files_deferq ran %d entries\n", ret);
	if (ret > 0)
		ret = 0;
 out:
	kfree(fdtable);
	return ret;
}

/*
 * Checkpoint wrapper for the file table of task @t: pins the task's
 * files_struct for the duration of checkpoint_obj() and returns the
 * value it produced, or -EBUSY if the task has no files_struct.
 */
int checkpoint_obj_file_table(struct ckpt_ctx *ctx, struct task_struct *t)
{
	struct files_struct *table = get_files_struct(t);
	int ref;

	if (!table)
		return -EBUSY;

	ref = checkpoint_obj(ctx, table, CKPT_OBJ_FILE_TABLE);
	put_files_struct(table);

	return ref;
}

/*
 * Checkpoint wrapper for the fs_struct of task @t: takes a reference
 * on t->fs, passes it to checkpoint_obj() and returns the value it
 * produced.
 */
int checkpoint_obj_fs(struct ckpt_ctx *ctx, struct task_struct *t)
{
	struct fs_struct *fs;
	int fs_objref;

	/*
	 * It is @t's fs pointer being sampled, so it is @t that must
	 * be locked (as ckpt_collect_fs() does), not current.
	 */
	task_lock(t);
	fs = t->fs;
	get_fs_struct(fs);
	task_unlock(t);

	fs_objref = checkpoint_obj(ctx, fs, CKPT_OBJ_FS);
	put_fs_struct(fs);

	return fs_objref;
}

/*
 * Checkpoint callback for an fs_struct: writes a CKPT_HDR_FS header
 * carrying the umask, followed by the pathname of the cwd and then
 * the pathname of the root, both relative to ctx->root_fs_path.
 *
 * Called with fs refcount bumped so it won't disappear.
 * Returns 0 on success or a negative error.
 */
static int checkpoint_fs(struct ckpt_ctx *ctx, void *ptr)
{
	struct fs_struct *fs = ptr;
	struct ckpt_hdr_fs *h;
	struct fs_struct *fscopy;
	int ret;

	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_FS);
	if (!h)
		return -ENOMEM;

	h->umask = fs->umask;

	ret = ckpt_write_obj(ctx, &h->h);
	ckpt_hdr_put(ctx, h);
	if (ret)
		return ret;

	/* the paths are read from a private copy of the fs_struct */
	fscopy = copy_fs_struct(fs);
	if (!fscopy)	/* test the copy: 'fs' itself is never NULL here */
		return -ENOMEM;

	ret = checkpoint_fname(ctx, &fscopy->pwd, &ctx->root_fs_path);
	if (ret < 0) {
		ckpt_err(ctx, ret, "%(T)writing path of cwd");
		goto out;
	}
	ret = checkpoint_fname(ctx, &fscopy->root, &ctx->root_fs_path);
	if (ret < 0) {
		ckpt_err(ctx, ret, "%(T)writing path of fs root");
		goto out;
	}
	ret = 0;
 out:
	free_fs_struct(fscopy);
	return ret;
}

/***********************************************************************
 * Collect
 */

/*
 * Collect a file: hand it to ckpt_obj_collect() and, the first time
 * the file is seen (positive result), also invoke its optional
 * f_op->collect() method.
 *
 * Returns the result of ckpt_obj_collect() when it is <= 0 or when
 * the file has no ->collect() method; otherwise the result of
 * ->collect().
 */
int ckpt_collect_file(struct ckpt_ctx *ctx, struct file *file)
{
	int ret;

	ret = ckpt_obj_collect(ctx, file, CKPT_OBJ_FILE);
	if (ret <= 0)
		return ret;
	/*
	 * if first time for this file (ret > 0), invoke ->collect();
	 * f_op itself may be missing, which checkpoint_file() also
	 * allows for, so test it before dereferencing.
	 */
	if (file->f_op && file->f_op->collect)
		ret = file->f_op->collect(ctx, file);
	if (ret < 0)
		ckpt_err(ctx, ret, "%(T)%(P)File collect\n", file);
	return ret;
}

/*
 * Collect the file behind descriptor @fd of @files. A reference is
 * held on the file while ckpt_collect_file() runs.
 *
 * Returns the result of ckpt_collect_file(), or -EBUSY if @fd is no
 * longer open.
 */
static int collect_file_desc(struct ckpt_ctx *ctx,
			     struct files_struct *files, int fd)
{
	struct file *file;
	int ret;

	/* fcheck_files() does the fdtable lookup itself */
	rcu_read_lock();
	file = fcheck_files(files, fd);
	if (file)
		get_file(file);
	rcu_read_unlock();

	if (!file) {
		ckpt_err(ctx, -EBUSY, "%(T)%(P)File removed\n", file);
		return -EBUSY;
	}

	ret = ckpt_collect_file(ctx, file);
	fput(file);

	return ret;
}

/*
 * Collect a file table and, the first time it is seen, every file
 * that is open in it.
 *
 * Returns the result of ckpt_obj_collect() when it is <= 0, the error
 * from scan_fds(), or the result of the last collect_file_desc() call
 * (the ckpt_obj_collect() result if no descriptor is open).
 */
static int collect_file_table(struct ckpt_ctx *ctx, struct files_struct *files)
{
	int *open_fds;
	int count, idx;
	int ret;

	/* a table that is already known (ret == 0) needs no more work */
	ret = ckpt_obj_collect(ctx, files, CKPT_OBJ_FILE_TABLE);
	if (ret <= 0)
		return ret;

	/* first sighting of this table (ret > 0): walk its descriptors */
	count = scan_fds(files, &open_fds);
	if (count < 0)
		return count;

	idx = 0;
	while (idx < count) {
		ret = collect_file_desc(ctx, files, open_fds[idx]);
		if (ret < 0)
			break;
		idx++;
	}

	kfree(open_fds);
	return ret;
}

/*
 * Collect the file table of task @t, holding a reference on its
 * files_struct meanwhile. Returns -EBUSY if the task has none,
 * otherwise the result of collect_file_table().
 */
int ckpt_collect_file_table(struct ckpt_ctx *ctx, struct task_struct *t)
{
	struct files_struct *table = get_files_struct(t);
	int err;

	if (table) {
		err = collect_file_table(ctx, table);
		put_files_struct(table);
		return err;
	}

	ckpt_err(ctx, -EBUSY, "%(T)files_struct missing\n");
	return -EBUSY;
}

/*
 * Collect the fs_struct of task @t: the pointer is sampled and
 * referenced under task_lock(t), and the reference is dropped again
 * once ckpt_obj_collect() has returned. Returns the result of
 * ckpt_obj_collect().
 */
int ckpt_collect_fs(struct ckpt_ctx *ctx, struct task_struct *t)
{
	struct fs_struct *task_fs;
	int result;

	task_lock(t);
	task_fs = t->fs;
	get_fs_struct(task_fs);
	task_unlock(t);

	result = ckpt_obj_collect(ctx, task_fs, CKPT_OBJ_FS);
	put_fs_struct(task_fs);

	return result;
}

/**************************************************************************
 * Restart
 */

static int ckpt_read_fname(struct ckpt_ctx *ctx, char **fname)
{
	int len;

	len = ckpt_read_payload(ctx, (void **) fname,
				PATH_MAX, CKPT_HDR_FILE_NAME);
	if (len < 0)
		return len;

	(*fname)[len - 1] = '\0';	/* always play if safe */
	ckpt_debug("read filename '%s'\n", *fname);

	return len;
}

/**
 * restore_open_fname - read a file name from the image and open it
 * @ctx: checkpoint context
 * @flags: open flags to pass to filp_open()
 *
 * Returns the result of filp_open(), ERR_PTR(-EINVAL) if @flags holds
 * any of O_CREAT, O_EXCL, O_NOCTTY or O_TRUNC, or an error pointer
 * carrying the failure of ckpt_read_fname().
 */
struct file *restore_open_fname(struct ckpt_ctx *ctx, int flags)
{
	const int forbidden = O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC;
	struct file *filp;
	char *name;
	int namelen;

	/* prevent bad input from doing bad things */
	if (flags & forbidden)
		return ERR_PTR(-EINVAL);

	namelen = ckpt_read_fname(ctx, &name);
	if (namelen < 0)
		return ERR_PTR(namelen);
	ckpt_debug("fname '%s' flags %#x\n", name, flags);

	filp = filp_open(name, flags, 0);

	/* the name is no longer needed, whether or not the open worked */
	kfree(name);
	return filp;
}

/*
 * Close every descriptor found open in @files by scan_fds(), from the
 * highest collected descriptor down to the lowest. Results of the
 * individual sys_close() calls are ignored.
 *
 * Returns 0, or the error from scan_fds().
 */
static int close_all_fds(struct files_struct *files)
{
	int *open_fds;
	int count, idx;

	count = scan_fds(files, &open_fds);
	if (count < 0)
		return count;

	for (idx = count - 1; idx >= 0; idx--)
		sys_close(open_fds[idx]);

	kfree(open_fds);
	return 0;
}

/**
 * attach_file - attach a lonely file ptr to a file descriptor
 * @file: lonely file pointer
 *
 * Takes an extra reference on @file and installs it in a newly
 * allocated descriptor. Returns that descriptor, or the negative
 * value from get_unused_fd_flags(), in which case @file is untouched.
 */
static int attach_file(struct file *file)
{
	int fd;

	fd = get_unused_fd_flags(0);
	if (fd < 0)
		return fd;

	get_file(file);
	fsnotify_open(file);
	fd_install(fd, file);

	return fd;
}

#define CKPT_SETFL_MASK  \
	(O_APPEND | O_NONBLOCK | O_NDELAY | FASYNC | O_DIRECT | O_NOATIME)

int restore_file_common(struct ckpt_ctx *ctx, struct file *file,
			struct ckpt_hdr_file *h)
{
	fmode_t new_mode = file->f_mode;
	fmode_t saved_mode = (__force fmode_t) h->f_mode;
	int ret;
	struct cred *cred;

	/* FIX: need to restore owner etc */

	/* restore the cred */
	cred = ckpt_obj_fetch(ctx, h->f_credref, CKPT_OBJ_CRED);
	if (IS_ERR(cred))
		return PTR_ERR(cred);
	put_cred(file->f_cred);
	file->f_cred = get_cred(cred);

	ret = security_restore_obj(ctx, (void *) file, CKPT_SECURITY_FILE,
				   h->f_secref);
	if (ret < 0) {
		ckpt_err(ctx, ret, "file secref %(O)%(P)\n", h->f_secref,
			 file);
		return ret;
	}

	/* safe to set 1st arg (fd) to 0, as command is F_SETFL */
	ret = vfs_fcntl(0, F_SETFL, h->f_flags & CKPT_SETFL_MASK, file);
	if (ret < 0)
		return ret;

	/*
	 * Normally f_mode is set by open, and modified only via
	 * fcntl(), so its value now should match that at checkpoint.
	 * However, a file may be downgraded from (read-)write to
	 * read-only, e.g:
	 *  - mark_files_ro() unsets FMODE_WRITE
	 *  - nfs4_file_downgrade() too, and also sert FMODE_READ
	 * Validate the new f_mode against saved f_mode, allowing:
	 *  - new with FMODE_WRITE, saved without FMODE_WRITE
	 *  - new without FMODE_READ, saved with FMODE_READ
	 */
	if ((new_mode & FMODE_WRITE) && !(saved_mode & FMODE_WRITE)) {
		new_mode &= ~FMODE_WRITE;
		if (!(new_mode & FMODE_READ) && (saved_mode & FMODE_READ))
			new_mode |= FMODE_READ;
	}
	/* finally, at this point new mode should match saved mode */
	if (new_mode ^ saved_mode)
		return -EINVAL;

	if (file->f_mode & FMODE_LSEEK)
		ret = vfs_llseek(file, h->f_pos, SEEK_SET);

	return ret;
}

/*
 * Restore a generic file: validate the header, open the file by the
 * name that follows in the image (using the saved f_flags), and apply
 * the common file state.
 *
 * Returns the file, ERR_PTR(-EINVAL) for a header of the wrong type,
 * length or file type, or an error pointer for any later failure.
 */
static struct file *generic_file_restore(struct ckpt_ctx *ctx,
					 struct ckpt_hdr_file *ptr)
{
	struct file *filp;
	int err;

	if (ptr->h.type != CKPT_HDR_FILE)
		return ERR_PTR(-EINVAL);
	if (ptr->h.len != sizeof(*ptr))
		return ERR_PTR(-EINVAL);
	if (ptr->f_type != CKPT_FILE_GENERIC)
		return ERR_PTR(-EINVAL);

	filp = restore_open_fname(ctx, ptr->f_flags);
	if (IS_ERR(filp))
		return filp;

	err = restore_file_common(ctx, filp, ptr);
	if (err >= 0)
		return filp;

	/* could not apply the saved state: give the file back */
	fput(filp);
	return ERR_PTR(err);
}

/*
 * Describes how one kind of file is restored; restore_file() selects
 * an entry by the f_type found in the file header.
 */
struct restore_file_ops {
	/* name of the file kind; not referenced by the code in this file */
	char *file_name;
	/* file kind; restore_file() BUG()s unless it equals the f_type looked up */
	enum file_type file_type;
	/*
	 * Builds the file from the header that was read. When NULL,
	 * restore_file() returns ERR_PTR(-EINVAL) for this kind.
	 */
	struct file * (*restore) (struct ckpt_ctx *ctx,
				  struct ckpt_hdr_file *ptr);
};

/*
 * Restore handlers, one per file kind. restore_file() indexes this
 * array directly with the header's f_type and BUG()s if the entry's
 * .file_type differs, so each entry must sit at the index equal to
 * its .file_type value.
 */
static struct restore_file_ops restore_file_ops[] = {
	/* ignored file */
	{
		.file_name = "IGNORE",
		.file_type = CKPT_FILE_IGNORE,
		.restore = NULL,
	},
	/* regular file/directory */
	{
		.file_name = "GENERIC",
		.file_type = CKPT_FILE_GENERIC,
		.restore = generic_file_restore,
	},
	/* pipes */
	{
		.file_name = "PIPE",
		.file_type = CKPT_FILE_PIPE,
		.restore = pipe_file_restore,
	},
	/* fifo */
	{
		.file_name = "FIFO",
		.file_type = CKPT_FILE_FIFO,
		.restore = fifo_file_restore,
	},
	/* socket */
	{
		.file_name = "SOCKET",
		.file_type = CKPT_FILE_SOCKET,
		.restore = sock_file_restore,
	},
	/* tty */
	{
		.file_name = "TTY",
		.file_type = CKPT_FILE_TTY,
		.restore = tty_file_restore,
	},
	/* epoll */
	{
		.file_name = "EPOLL",
		.file_type = CKPT_FILE_EPOLL,
		.restore = ep_file_restore,
	},
	/* eventfd */
	{
		.file_name = "EVENTFD",
		.file_type = CKPT_FILE_EVENTFD,
		.restore = eventfd_restore,
	},
};

/*
 * Restore callback for a file object: reads the file header and
 * dispatches to the handler registered for its f_type in
 * restore_file_ops[].
 *
 * Returns the handler's result, ERR_PTR(-EINVAL) if f_type is out of
 * range or has no handler, or the error pointer from reading the
 * header.
 */
static void *restore_file(struct ckpt_ctx *ctx)
{
	struct restore_file_ops *entry;
	struct ckpt_hdr_file *hdr;
	struct file *filp;

	/*
	 * All 'struct ckpt_hdr_file_...' begin with ckpt_hdr_file,
	 * but the actual object depends on the file type. The length
	 * should never be more than page.
	 */
	hdr = ckpt_read_buf_type(ctx, PAGE_SIZE, CKPT_HDR_FILE);
	if (IS_ERR(hdr))
		return (void *)hdr;
	ckpt_debug("flags %#x mode %#x type %d\n",
		 hdr->f_flags, hdr->f_mode, hdr->f_type);

	/* the result unless a handler below replaces it */
	filp = ERR_PTR(-EINVAL);

	if (hdr->f_type < CKPT_FILE_MAX) {
		entry = &restore_file_ops[hdr->f_type];
		BUG_ON(entry->file_type != hdr->f_type);

		if (entry->restore)
			filp = entry->restore(ctx, hdr);
	}

	ckpt_hdr_put(ctx, hdr);
	return (void *)filp;
}

/**
 * restore_file_desc - restore the state of a given file descriptor
 * @ctx: checkpoint context
 *
 * Reads a CKPT_HDR_FILE_DESC record, fetches the file that its objref
 * designates with ckpt_obj_fetch(), installs that file at the recorded
 * descriptor number and sets the descriptor's close-on-exec flag from
 * the record.
 *
 * Returns 0 on success, -EINVAL for a non-positive objref or negative
 * descriptor, or the negative error of the step that failed.
 */
static int restore_file_desc(struct ckpt_ctx *ctx)
{
	struct ckpt_hdr_file_desc *h;
	struct file *file;
	int newfd, ret;

	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_FILE_DESC);
	if (IS_ERR(h))
		return PTR_ERR(h);
	ckpt_debug("ref %d fd %d c.o.e %d\n",
		 h->fd_objref, h->fd_descriptor, h->fd_close_on_exec);

	ret = -EINVAL;
	if (h->fd_objref <= 0 || h->fd_descriptor < 0)
		goto out;

	file = ckpt_obj_fetch(ctx, h->fd_objref, CKPT_OBJ_FILE);
	if (IS_ERR(file)) {
		ret = PTR_ERR(file);
		goto out;
	}

	newfd = attach_file(file);
	if (newfd < 0) {
		ret = newfd;
		goto out;
	}

	ckpt_debug("newfd got %d wanted %d\n", newfd, h->fd_descriptor);

	/* reposition if newfd isn't desired fd */
	if (newfd != h->fd_descriptor) {
		ret = sys_dup2(newfd, h->fd_descriptor);
		/*
		 * The temporary descriptor is of no use either way:
		 * close it on failure too, so that it does not stay
		 * installed in the file table.
		 */
		sys_close(newfd);
		if (ret < 0)
			goto out;
	}

	set_close_on_exec(h->fd_descriptor, h->fd_close_on_exec);
	ret = 0;
 out:
	ckpt_hdr_put(ctx, h);
	return ret;
}

/*
 * Restore callback for a file table: closes all descriptors of the
 * current task, restores the recorded number of descriptors from the
 * image, then runs the deferred work queued on ctx->files_deferq.
 *
 * Returns current->files with its count incremented, or an error
 * pointer (-EMFILE when the recorded count is negative or exceeds
 * sysctl_nr_open).
 */
static void *restore_file_table(struct ckpt_ctx *ctx)
{
	struct ckpt_hdr_file_table *hdr;
	struct files_struct *table;
	int idx, err;

	hdr = ckpt_read_obj_type(ctx, sizeof(*hdr), CKPT_HDR_FILE_TABLE);
	if (IS_ERR(hdr))
		return (void *)hdr;

	ckpt_debug("nfds %d\n", hdr->fdt_nfds);

	if (hdr->fdt_nfds < 0 || hdr->fdt_nfds > sysctl_nr_open) {
		err = -EMFILE;
		goto done;
	}

	/*
	 * We assume that restarting tasks, as created in user-space,
	 * have distinct files_struct objects each. If not, we need to
	 * call dup_fd() to make sure we don't overwrite an already
	 * restored one.
	 */

	/* point of no return -- close all file descriptors */
	err = close_all_fds(current->files);
	if (err < 0)
		goto done;

	for (idx = 0; idx < hdr->fdt_nfds; idx++) {
		err = restore_file_desc(ctx);
		if (err < 0)
			goto done;
	}

	/* a positive result counts the entries that ran: not an error */
	err = deferqueue_run(ctx->files_deferq);
	ckpt_debug("files_deferq ran %d entries\n", err);
	if (err > 0)
		err = 0;
 done:
	ckpt_hdr_put(ctx, hdr);
	if (err)
		return (void *)ERR_PTR(err);

	/* the returned table carries a reference of its own */
	table = current->files;
	atomic_inc(&table->count);
	return (void *)table;
}

/*
 * Make the file table designated by @files_objref the file table of
 * the current task. Nothing is done if it already is; otherwise the
 * new table gains a reference and the previous one is released.
 *
 * Returns 0, or the error from ckpt_obj_fetch().
 */
int restore_obj_file_table(struct ckpt_ctx *ctx, int files_objref)
{
	struct files_struct *wanted, *old;

	wanted = ckpt_obj_fetch(ctx, files_objref, CKPT_OBJ_FILE_TABLE);
	if (IS_ERR(wanted))
		return PTR_ERR(wanted);

	if (wanted == current->files)
		return 0;

	/* swap the pointer under the task lock */
	task_lock(current);
	old = current->files;
	current->files = wanted;
	atomic_inc(&wanted->count);
	task_unlock(current);

	put_files_struct(old);
	return 0;
}

/*
 * Called by task restore code to point the restarted task's
 * current->fs at the fs_struct stored in the hash under @fs_objref.
 * The new fs_struct gains a reference; the one it replaces is put.
 *
 * Returns 0, or the error from ckpt_obj_fetch().
 */
int restore_obj_fs(struct ckpt_ctx *ctx, int fs_objref)
{
	struct fs_struct *incoming;
	struct fs_struct *outgoing;

	incoming = ckpt_obj_fetch(ctx, fs_objref, CKPT_OBJ_FS);
	if (IS_ERR(incoming))
		return PTR_ERR(incoming);

	task_lock(current);
	get_fs_struct(incoming);
	outgoing = current->fs;
	current->fs = incoming;
	task_unlock(current);

	/* released only after the task lock has been dropped */
	put_fs_struct(outgoing);

	return 0;
}

static int restore_chroot(struct ckpt_ctx *ctx, struct fs_struct *fs, char *name)
{
	struct nameidata nd;
	int ret;

	ckpt_debug("attempting chroot to %s\n", name);
	ret = path_lookup(name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &nd);
	if (ret) {
		ckpt_err(ctx, ret, "%(T)Opening chroot dir %s", name);
		return ret;
	}
	ret = do_chroot(fs, &nd.path);
	path_put(&nd.path);
	if (ret) {
		ckpt_err(ctx, ret, "%(T)Setting chroot %s", name);
		return ret;
	}
	return 0;
}

static int restore_cwd(struct ckpt_ctx *ctx, struct fs_struct *fs, char *name)
{
	struct nameidata nd;
	int ret;

	ckpt_debug("attempting chdir to %s\n", name);
	ret = path_lookup(name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &nd);
	if (ret) {
		ckpt_err(ctx, ret, "%(T)Opening cwd %s", name);
		return ret;
	}
	ret = do_chdir(fs, &nd.path);
	path_put(&nd.path);
	if (ret) {
		ckpt_err(ctx, ret, "%(T)Setting cwd %s", name);
		return ret;
	}
	return 0;
}

/*
 * Called by objhash when it runs into a CKPT_OBJ_FS entry. Starts
 * from a copy of current->fs, applies the saved umask (permission
 * bits only), then the saved cwd followed by the saved root, each
 * read from the image as a pathname.
 *
 * Returns the new fs_struct, or an error pointer.
 */
static void *restore_fs(struct ckpt_ctx *ctx)
{
	struct ckpt_hdr_fs *hdr;
	struct fs_struct *newfs;
	char *name;
	int err;

	hdr = ckpt_read_obj_type(ctx, sizeof(*hdr), CKPT_HDR_FS);
	if (IS_ERR(hdr))
		return ERR_PTR(PTR_ERR(hdr));

	newfs = copy_fs_struct(current->fs);
	if (!newfs) {
		err = -ENOMEM;
		goto done;
	}

	newfs->umask = hdr->umask & S_IRWXUGO;

	/* first pathname in the image: the working directory */
	err = ckpt_read_fname(ctx, &name);
	if (err < 0)
		goto done;
	err = restore_cwd(ctx, newfs, name);
	kfree(name);
	if (err)
		goto done;

	/* second pathname in the image: the root directory */
	err = ckpt_read_fname(ctx, &name);
	if (err < 0)
		goto done;
	err = restore_chroot(ctx, newfs, name);
	kfree(name);

done:
	ckpt_hdr_put(ctx, hdr);
	if (!err)
		return newfs;

	if (newfs)
		free_fs_struct(newfs);
	return ERR_PTR(err);
}

/*
 * fs-related checkpoint objects
 */

/* objhash ->ref_grab for fs objects: take a reference; always returns 0 */
static int obj_fs_grab(void *ptr)
{
	struct fs_struct *fs = ptr;

	get_fs_struct(fs);
	return 0;
}

/* objhash ->ref_drop for fs objects: put a reference; @lastref is unused */
static void obj_fs_drop(void *ptr, int lastref)
{
	struct fs_struct *fs = ptr;

	put_fs_struct(fs);
}

/* objhash ->ref_users for fs objects: report the fs_struct's user count */
static int obj_fs_users(void *ptr)
{
	struct fs_struct *fs = ptr;

	/*
	 * It's safe to not use fs->lock because the fs is referenced.
	 * It's also sufficient for leak detection: with no leak the
	 * count can't change; with a leak it will be too big already
	 * (even if it's about to grow), and if it's about to shrink
	 * then it's as if we sampled the count a bit earlier.
	 */
	return fs->users;
}

/* objhash ->ref_grab for file tables: bump the count; always returns 0 */
static int obj_file_table_grab(void *ptr)
{
	struct files_struct *files = ptr;

	atomic_inc(&files->count);
	return 0;
}

/* objhash ->ref_drop for file tables: put a reference; @lastref is unused */
static void obj_file_table_drop(void *ptr, int lastref)
{
	struct files_struct *files = ptr;

	put_files_struct(files);
}

/* objhash ->ref_users for file tables: report the files_struct's count */
static int obj_file_table_users(void *ptr)
{
	struct files_struct *files = ptr;

	return atomic_read(&files->count);
}

/* objhash ->ref_grab for files: take a reference; always returns 0 */
static int obj_file_grab(void *ptr)
{
	struct file *filp = ptr;

	get_file(filp);
	return 0;
}

/* objhash ->ref_drop for files: put a reference; @lastref is unused */
static void obj_file_drop(void *ptr, int lastref)
{
	struct file *filp = ptr;

	fput(filp);
}

/* objhash ->ref_users for files: report the file's f_count */
static int obj_file_users(void *ptr)
{
	struct file *filp = ptr;

	return atomic_long_read(&filp->f_count);
}

/*
 * fs object: operations for CKPT_OBJ_FS entries (struct fs_struct),
 * registered by checkpoint_register_fs().
 */
static const struct ckpt_obj_ops ckpt_obj_fs_ops = {
	.obj_name = "FS",
	.obj_type = CKPT_OBJ_FS,
	.ref_drop = obj_fs_drop,
	.ref_grab = obj_fs_grab,
	.ref_users = obj_fs_users,
	.checkpoint = checkpoint_fs,
	.restore = restore_fs,
};

/*
 * files_struct object: operations for CKPT_OBJ_FILE_TABLE entries
 * (struct files_struct), registered by checkpoint_register_fs().
 */
static const struct ckpt_obj_ops ckpt_obj_files_struct_ops = {
	.obj_name = "FILE_TABLE",
	.obj_type = CKPT_OBJ_FILE_TABLE,
	.ref_drop = obj_file_table_drop,
	.ref_grab = obj_file_table_grab,
	.ref_users = obj_file_table_users,
	.checkpoint = checkpoint_file_table,
	.restore = restore_file_table,
};

/*
 * file object: operations for CKPT_OBJ_FILE entries (struct file),
 * registered by checkpoint_register_fs().
 */
static const struct ckpt_obj_ops ckpt_obj_file_ops = {
	.obj_name = "FILE",
	.obj_type = CKPT_OBJ_FILE,
	.ref_drop = obj_file_drop,
	.ref_grab = obj_file_grab,
	.ref_users = obj_file_users,
	.checkpoint = checkpoint_file,
	.restore = restore_file,
};

/*
 * Register the fs, file-table and file object operations, in that
 * order, stopping at the first failure. Returns 0 on success or the
 * negative error of the registration that failed.
 */
static __init int checkpoint_register_fs(void)
{
	int err;

	err = register_checkpoint_obj(&ckpt_obj_fs_ops);
	if (err >= 0)
		err = register_checkpoint_obj(&ckpt_obj_files_struct_ops);
	if (err >= 0)
		err = register_checkpoint_obj(&ckpt_obj_file_ops);

	return err < 0 ? err : 0;
}
late_initcall(checkpoint_register_fs);
